﻿from urllib import request
import re
class TaoBaoSpider(object):
	"""Fetch listing pages from mm.taobao.com and extract per-entry fields.

	Attributes:
		url: Before the first request, the base listing URL; afterwards, the
			full URL of the most recently requested page.
		page: Page number passed to the most recent ``getPage`` call (not set
			until the first call).
	"""

	# Listing endpoint; the page number is appended to form a request URL.
	BASE_URL = "http://mm.taobao.com/json/request_top_list.htm?page="

	# User-Agent header sent with every request.
	USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"

	# Compiled once at class creation instead of on every getContents call.
	# Capture groups, in order: first link href, image src, text of the
	# "lady-name" anchor, <strong> content, <span> content.
	ITEM_PATTERN = re.compile('<div class="list-item".*?pic-word.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>', re.S)

	def __init__(self):
		self.url = self.BASE_URL

	def _pageUrl(self, page):
		"""Return the request URL for listing page ``page``."""
		return self.BASE_URL + str(page)

	def _parseItems(self, html):
		"""Return a list of 5-tuples of strings, one per entry found in ``html``."""
		return self.ITEM_PATTERN.findall(html)

	def getPage(self, page):
		"""Download listing page ``page`` and return its body as text.

		Args:
			page: Page number; converted with ``str`` and appended to the
				base URL.

		Returns:
			The response body decoded as GBK.

		Raises:
			urllib.error.URLError: If the request fails.
			UnicodeDecodeError: If the body is not valid GBK.
		"""
		self.page = page
		# Build the URL from the constant base every time. Appending to
		# self.url would accumulate page numbers across calls
		# ("...page=1" -> "...page=12").
		self.url = self._pageUrl(page)
		print(self.url)
		req = request.Request(self.url)
		req.add_header("User-Agent", self.USER_AGENT)
		with request.urlopen(req) as f:
			print('Status:', f.status, f.reason)
			return f.read().decode("gbk")

	def getContents(self, pageIndex):
		"""Fetch listing page ``pageIndex``, print each entry and return them.

		Args:
			pageIndex: Page number forwarded to ``getPage``.

		Returns:
			A list of 5-tuples of strings (see ``ITEM_PATTERN`` for the
			field order); empty when nothing matches.
		"""
		items = self._parseItems(self.getPage(pageIndex))
		for item in items:
			print(*item)
		return items

	# Write an image to disk (the stub below is unfinished)
	# def saveImg(self,imageURL,fileName):
		# u = 
# Script driver: process listing page 1 and print whatever getContents returns.
T = TaoBaoSpider()
print(T.getContents(1))